In [1]:
# Author : Paul-Antoine Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold

# some overhead because of kernel memory limits

import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import gc
from tqdm import tqdm, tqdm_notebook

tqdm.pandas(desc="")
tqdm_notebook().pandas(desc="")

%load_ext ipycache

IDIR = 'input/'


print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv')
print('loading train')
op_train = pd.read_csv(IDIR + 'order_products__train.csv')
print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv')
print('loading products')
products = pd.read_csv(IDIR + 'products.csv')

departments = pd.read_csv(IDIR + 'departments.csv', engine='c')
aisles = pd.read_csv(IDIR + 'aisles.csv', engine='c')

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(op_train.shape, ', '.join(op_train.columns)))
print('Total departments: {}'.format(departments.shape[0]))
print('Total aisles: {}'.format(aisles.shape[0]))


/home/ubuntu/.venv/local/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/ubuntu/.venv/local/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated since IPython 4.0. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/home/ubuntu/.venv/local/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")
loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
Total departments: 21
Total aisles: 134

In [2]:
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)
orders.set_index('order_id', inplace=True, drop=False)

products.drop(['product_name'], axis=1, inplace=True)
products.aisle_id = products.aisle_id.astype(np.int8)
products.department_id = products.department_id.astype(np.int8)
products.product_id = products.product_id.astype(np.int32)

op_train.reordered = op_train.reordered.astype(np.int8)
op_train.add_to_cart_order = op_train.add_to_cart_order.astype(np.int16)
op_train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)

Features

https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/35468

Here are some feature ideas that can help new participants get started and may be you will find something you have missed:


In [3]:
priors = priors.join(orders, on='order_id', rsuffix='_')
priors = priors.join(products, on='product_id', rsuffix='_')
priors.drop(['product_id_', 'order_id_'], inplace=True, axis=1)

Product

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean/std add_to_cart_order
  • etc.

In [4]:
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.float32)
prods['order_freq'] = prods['orders'] / len(priors.order_id.unique())
prods['users'] = priors.groupby(priors.product_id).user_id.unique().apply(len)
prods['add_to_cart_order_mean'] = priors.groupby(priors.product_id).add_to_cart_order.mean()
prods['add_to_cart_order_std'] = priors.groupby(priors.product_id).add_to_cart_order.std()

prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods

User

  • Products purchased
  • Orders made
  • frequency and recency of orders
  • Aisle purchased from
  • Department purchased from
  • frequency and recency of reorders
  • tenure
  • mean order size
  • etc.

In [5]:
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr["period"] = orders.groupby('user_id').days_since_prior_order.fillna(0).sum()
usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users = users.join(usr)
del usr
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
gc.collect()
print('user f', users.shape)


Out[5]:
42
user f (206209, 7)

Aisle

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean add_to_cart_order
  • etc.

Department

  • users
  • orders
  • order frequency
  • reorder rate
  • recency
  • mean add_to_cart_order
  • etc.

User Product Interaction (UP)

  • purchases
  • reorders
  • day since last purchase
  • order since last purchase
  • etc.

In [9]:
# %%cache userXproduct.pkl userXproduct
priors['user_product'] = priors.product_id + priors.user_id * 100000

d = dict()
for row in tqdm(priors.itertuples(), total=len(priors)):
    z = row.user_product
    if z not in d:
        d[z] = (
            1,
            (row.order_number, row.order_id),
            row.add_to_cart_order,
            row.reordered
        )
    else:
        d[z] = (
            d[z][0] + 1,
            max(d[z][1], (row.order_number, row.order_id)),
            d[z][2] + row.add_to_cart_order,
            d[z][3] + row.reordered
        )

print('to dataframe (less memory)')
d = pd.DataFrame.from_dict(d, orient='index')
d.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'reorders']
d.nb_orders = d.nb_orders.astype(np.int16)
d.last_order_id = d.last_order_id.map(lambda x: x[1]).astype(np.int32)
d.sum_pos_in_cart = d.sum_pos_in_cart.astype(np.int16)
   
userXproduct = d
print('user X product f', len(userXproduct))


100%|██████████| 32434489/32434489 [03:43<00:00, 144874.10it/s]
to dataframe (less memory)
user X product f 13293564

User aisle interaction (UA)

  • purchases
  • reorders
  • day since last purchase
  • order since last purchase
  • etc.

User department interaction (UD)

  • purchases
  • reorders
  • day since last purchase
  • order since last purchase
  • etc.

User time interaction (UT)

  • user preferred day of week
  • user preferred time of day
  • similar features for products and aisles

Combine


In [7]:
### build list of candidate products to reorder, with features ###
train_index = set(op_train.index)

def features(selected_orders, labels_given=False):
    order_list = []
    product_list = []
    labels = []
    for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train_index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list})
    df.order_id = df.order_id.astype(np.int32)
    df.product_id = df.product_id.astype(np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    df['user_period'] =  df.user_id.map(users.period)
    
    print('order related features')
    # df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int8)
    df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
    df['product_users'] = df.product_id.map(products.users).astype(np.float32)
    df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
    df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    df['z'] = df.product_id + df.user_id * 100000
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)

    df['UP_reorders'] = df.z.map(userXproduct.reorders)

    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
                  df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    
#     df['UP_days_past_last_buy'] = 
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)

    gc.collect()
    return (df, labels)

In [10]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

df_train, labels = features(train_orders, labels_given=True)
df_test, _ = features(test_orders)


split orders : train, test
100%|██████████| 131209/131209 [00:12<00:00, 10906.86it/s]
user related features
order related features
product related features
user_X_product related features
100%|██████████| 75000/75000 [00:02<00:00, 32167.53it/s]
user related features
order related features
product related features
user_X_product related features

Train


In [11]:
f_to_use = [
    'user_total_orders', 'user_total_items', 'user_total_distinct_items',
    'user_average_days_between_orders', 'user_average_basket',
    'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
    'aisle_id', 'department_id', 'product_orders', 'product_reorders',
    'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
    'UP_average_pos_in_cart', 'UP_reorders', 'UP_orders_since_last',
    'UP_delta_hour_vs_last'
]

def feature_select(df):
    return df.drop(["user_id", "order_id", "product_id"], axis=1, errors="ignore")

In [12]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 98

def train(traindf, y):
    d_train = lgb.Dataset(
        feature_select(traindf),
        label=y,
        categorical_feature=['aisle_id', 'department_id']
    )

    model = lgb.train(params, d_train, ROUNDS)
    return model

In [13]:
model = train(df_train, labels)

Predict


In [17]:
def predict(model, df_test, TRESHOLD=0.19, predicted_basket_size=None):
    ### build candidates list for test ###

    df_test['pred'] = model.predict(feature_select(df_test))

    d = dict()
    if not predicted_basket_size:
        for row in df_test.itertuples():
            if row.pred > TRESHOLD:
                try:
                    d[row.order_id] += ' ' + str(row.product_id)
                except KeyError:
                    d[row.order_id] = str(row.product_id)
    else:
        # Вот тут можно отрезать не по threshold, а с помощью модели определять кол-во покупок
        current_order_id = None
        current_order_count = 0
        for row in df_test.sort_values(
            by=["order_id", "pred"], 
            ascending=[False, False]
        ).itertuples():
            order_id = row.order_id
            if order_id != current_order_id:
                current_order_id = order_id
                current_order_count = 0
            if current_order_count >= predicted_basket_size[current_order_id]:
                continue
            current_order_count += 1
            try:
                d[order_id] += ' ' + str(row.product_id)
            except KeyError:
                d[order_id] = str(row.product_id)

    for order_id in df_test.order_id:
        if order_id not in d:
            d[order_id] = 'None'

    sub = pd.DataFrame.from_dict(d, orient='index')
    sub.reset_index(inplace=True)
    sub.columns = ['order_id', 'products']
    return sub

In [18]:
# Загружаем предсказанное кол-во покупок
predicted_basket_size = pd.read_csv("test_orders_products_count.csv", index_col="order_id")
predicted_basket_size = predicted_basket_size["pred_products_count"].to_dict()

In [19]:
sub = predict(model, df_test, predicted_basket_size=predicted_basket_size)
sub.to_csv('sub.csv', index=False)

CV


In [ ]:
lgb.cv(params, d_train, ROUNDS, nfold=5, verbose_eval=10)

In [12]:
%%cache df_train_gt.pkl df_train_gt

from functools import partial

products_raw = pd.read_csv(IDIR + 'products.csv')
# combine aisles, departments and products (left joined to products)
goods = pd.merge(left=pd.merge(left=products_raw, right=departments, how='left'), right=aisles, how='left')
# to retain '-' and make product names more "standard"
goods.product_name = goods.product_name.str.replace(' ', '_').str.lower() 

# retype goods to reduce memory usage
goods.product_id = goods.product_id.astype(np.int32)
goods.aisle_id = goods.aisle_id.astype(np.int16)
goods.department_id = goods.department_id.astype(np.int8)

# initialize it with train dataset
train_details = pd.merge(
                left=op_train,
                 right=orders, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

# add order hierarchy
train_details = pd.merge(
                left=train_details,
                right=goods[['product_id', 
                             'aisle_id', 
                             'department_id']].apply(partial(pd.to_numeric, 
                                                             errors='ignore', 
                                                             downcast='integer')),
                how='left',
                on='product_id'
)

train_gtl = []

for uid, subset in train_details.groupby('user_id'):
    subset1 = subset[subset.reordered == 1]
    oid = subset.order_id.values[0]

    if len(subset1) == 0:
        train_gtl.append((oid, 'None'))
        continue

    ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
    # .strip is needed because join can have a padding space at the end
    train_gtl.append((oid, ostr.strip()))

del train_details
del goods
del products_raw

gc.collect()

df_train_gt = pd.DataFrame(train_gtl)

df_train_gt.columns = ['order_id', 'products']
df_train_gt.set_index('order_id', inplace=True)
df_train_gt.sort_index(inplace=True)


[Skipped the cell's code and loaded variables df_train_gt from file '/Users/evgeny/PycharmProjects/kaggle/instacart/df_train_gt.pkl'.]

In [14]:
from sklearn.model_selection import GroupKFold

def f1_score(cvpred):
    joined = df_train_gt.join(cvpred, rsuffix="_cv", how="inner")
    lgts = joined.products.replace("None", "-1").apply(lambda x: x.split(" ")).values
    lpreds = joined.products_cv.replace("None", "-1").apply(lambda x: x.split(" ")).values
    f1 = []
    for lgt, lpred in zip(lgts, lpreds):
        rr = (np.intersect1d(lgt, lpred))
        precision = np.float(len(rr)) / len(lpred)
        recall = np.float(len(rr)) / len(lgt)

        denom = precision + recall
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)
    return np.mean(f1)

def cv(threshold=0.22):
    gkf = GroupKFold(n_splits=5)

    scores = []
    for train_idx, test_idx in gkf.split(df_train.index, groups=df_train.user_id):
        dftrain = df_train.iloc[train_idx]
        dftest = df_train.iloc[test_idx]
        y = labels[train_idx]
        model = train(dftrain, y)
        pred = predict(model, dftest, threshold).set_index("order_id")
        f1 = f1_score(pred)
        print f1
        scores.append(f1)
        del dftrain
        del dftest
        gc.collect()

    return np.mean(scores), np.std(scores)

In [ ]:
cv()

In [18]:
for th in np.arange(0.18, 0.22, 0.01):
    print th
    print cv(threshold=th)
    print


0.18
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)

0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)

0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)

0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)

0.22
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-18-6c718121d7c6> in <module>()
      1 for th in np.arange(0.18, 0.22, 0.01):
      2     print th
----> 3     print cv(threshold=th)
      4     print

<ipython-input-14-8a57b9000d45> in cv(threshold)
      8         dftest = df_train.iloc[test_idx]
      9         y = labels[train_idx]
---> 10         model = train(dftrain, y)
     11         pred = predict(model, dftest, threshold).set_index("order_id")
     12         f1 = f1_score(pred)

<ipython-input-10-32f13b5c7596> in train(traindf, y)
     18     )
     19 
---> 20     model = lgb.train(params, d_train, ROUNDS)
     21     return model

/usr/local/lib/python2.7/site-packages/lightgbm-0.2-py2.7.egg/lightgbm/engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, callbacks)
    178                                     evaluation_result_list=None))
    179 
--> 180         booster.update(fobj=fobj)
    181 
    182         evaluation_result_list = []

/usr/local/lib/python2.7/site-packages/lightgbm-0.2-py2.7.egg/lightgbm/basic.py in update(self, train_set, fobj)
   1368             _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
   1369                 self.handle,
-> 1370                 ctypes.byref(is_finished)))
   1371             self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
   1372             return is_finished.value == 1

KeyboardInterrupt: 

0.372658477911


In [17]:
0.18
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)

0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)

0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)


0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)


0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351

(0.37491390084571824, 0.001620734287706205)

0.374504880043
0.372459365153
0.374241429517
0.373332070018
0.370178093483
(0.37294316764289259, 0.0015591904647740879) 0.22
0.370290530162
0.369518178297
0.370515696117
0.369568282123
0.3673846793
(0.36945547319979183, 0.0011069090226251931) 0.24
0.363691285892
0.363725106289
0.363492700824
0.364412180878
0.363024994542
(0.36366925368510306, 0.00044761289123321511) 0.26


  File "<ipython-input-17-fd741fd5103b>", line 6
    (0.37491390084571824, 0.001620734287706205) 0.2
                                                  ^
SyntaxError: invalid syntax

Модель определения кол-ва reordered


In [6]:
prior_orders_count = priors[["order_id", "reordered"]].groupby("order_id").sum()
prior_orders_count = prior_orders_count.rename(columns={"reordered": "product_counts"})

train_orders_count = op_train.drop(["product_id", "order_id"], axis=1, errors="ignore")
train_orders_count = train_orders_count.reset_index()[["order_id", "reordered"]].groupby("order_id").sum()
train_orders_count = train_orders_count.rename(columns={"reordered": "product_counts"})

prior_orders_count = orders.join(prior_orders_count, how='inner')
train_orders_count = orders.join(train_orders_count, how='inner')

def extend_prev_prod_count(df, period=1):
    global prior_orders_count
    prior_orders_count["next_order_number"] = prior_orders_count["order_number"] + period
    mdf = prior_orders_count[["user_id", "next_order_number", "product_counts"]]
    mdf = mdf.add_suffix("_prev%s" % period)
    try:
        return df.merge(
            mdf,
            left_on=["user_id", "order_number"], 
            right_on=["user_id_prev%s" % period, "next_order_number_prev%s" % period],
            how="left",
        ).drop([
            "next_order_number",
            "next_order_number_prev%s"  % period,
            "user_id_prev%s"  % period,
        ], axis=1, errors="ignore")
    finally:
        prior_orders_count.drop("next_order_number", axis=1, inplace=True)

train_orders_count = extend_prev_prod_count(train_orders_count, 1)
train_orders_count = extend_prev_prod_count(train_orders_count, 2)
prior_orders_count = extend_prev_prod_count(prior_orders_count, 1)
prior_orders_count = extend_prev_prod_count(prior_orders_count, 2)

prior_orders_count.head(15)


Out[6]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order product_counts product_counts_prev1 product_counts_prev2
0 2539329 1 prior 1 2 8 NaN 0.0 NaN NaN
1 2398795 1 prior 2 3 7 15.0 3.0 0.0 NaN
2 473747 1 prior 3 3 12 21.0 3.0 3.0 0.0
3 2254736 1 prior 4 4 7 29.0 5.0 3.0 3.0
4 431534 1 prior 5 4 15 28.0 5.0 5.0 3.0
5 3367565 1 prior 6 2 7 19.0 4.0 5.0 5.0
6 550135 1 prior 7 1 9 20.0 5.0 4.0 5.0
7 3108588 1 prior 8 1 14 14.0 4.0 5.0 4.0
8 2295261 1 prior 9 1 16 0.0 6.0 4.0 5.0
9 2550362 1 prior 10 4 8 30.0 6.0 6.0 4.0
10 2168274 2 prior 1 2 11 NaN 0.0 NaN NaN
11 1501582 2 prior 2 5 10 10.0 1.0 0.0 NaN
12 1901567 2 prior 3 1 10 3.0 3.0 1.0 0.0
13 738281 2 prior 4 2 10 8.0 1.0 3.0 1.0
14 1673511 2 prior 5 3 11 8.0 1.0 1.0 3.0

In [7]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error


def get_order_count(order, alpha=0.5):
    user_id = order["user_id"]
    df = prior_orders_count[prior_orders_count["user_id"] == user_id]
    feats = [
        "order_number", "product_counts_prev1", "product_counts_prev2",
        "order_dow", "order_hour_of_day", "days_since_prior_order"
    ]
    X = df[feats].values
#     X = np.nan_to_num(X, 0)
    y = df["product_counts"].values

    # create dataset for lightgbm
#     lgb_train = lgb.Dataset(X, y)
#     params = {
#         'task': 'train',
#         'boosting_type': 'gbdt',
#         'objective': 'regression',
#         'metric': {'rmse'},
#         'num_leaves': 100,
#         'learning_rate': 0.01,
#         'feature_fraction': 0.9,
#         'bagging_fraction': 0.8,
#         'bagging_freq': 5,
#         'verbose': 0,
#     }
#     clf = lgb.train(params,
#                     lgb_train,
#                     num_boost_round=40)

    xgb_params = {
        'max_depth': 3,
        'n_estimators': 70,
        'learning_rate': 0.05,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    dtrain_all = xgb.DMatrix(X, y)
    clf = xgb.train(xgb_params, dtrain_all, num_boost_round=400)

#     clf = Lasso(alpha=0.01)
#     clf.fit(X, y)

    Xpred = np.array([order[f] or 0 for f in feats]).reshape(1, -1)
#     Xpred = np.nan_to_num(Xpred, 0)

    Xpred = xgb.DMatrix(Xpred)
    return int(round(np.round(clf.predict(Xpred)[0])))

df = train_orders_count.head(100)
df["pred_products_count"] = df.apply(get_order_count, axis=1)

print(mean_squared_error(
    df["product_counts"],
    df["pred_products_count"]
))


25.45
/home/ubuntu/.venv/lib/python2.7/site-packages/ipykernel_launcher.py:55: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [8]:
df = orders[orders.eval_set == 'test']
df = extend_prev_prod_count(df, 1)
df = extend_prev_prod_count(df, 2)

df["pred_products_count"] = df.progress_apply(get_order_count, axis=1)
df.to_csv("test_orders_products_count.csv", index=False, header=True)
df.head()


540/|/  1%|| 540/75000 [02:50<1:32:27, 13.42it/s]
Out[8]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order product_counts_prev1 product_counts_prev2 pred_products_count
0 2774568 3 test 13 5 15 11.0 6.0 5.0 6
1 329954 4 test 6 3 12 30.0 0.0 0.0 0
2 1528013 6 test 4 3 16 22.0 0.0 2.0 2
3 1376945 11 test 8 6 11 8.0 3.0 7.0 5
4 1356845 12 test 6 1 20 30.0 5.0 5.0 5

In [ ]: